#include "BaseMemoryLib.h"//For CopyMem
#include "SHA1.h"
#include "HelperAsm.h"
/*Secure Hash Algorithm*/
/* Shared SHA-1 state. HashTarget() seeds VarH0..VarH4 with the initial hash
   constants and Sha1Block() folds each block into them, using VarA..VarE as
   its per-block working variables. Because this is a single file-scope object
   that every call overwrites, only one hash can be in progress at a time. */
SHA1_VAR gSHA1Var;
/* Message schedule w[0..79] that Sha1Block() rebuilds for every 64-byte block.
   Despite the name it holds all eighty words, not only w[16..79]. */
UINT32 VarW16_79[80];
/*
	Compute the SHA-1 digest of a message.

	pMsg     - message bytes to hash
	MsgLen   - message length in bytes
	pHashOut - receives the five 32-bit digest words (20 bytes); when NULL
	           the function returns without hashing

	HashTarget() leaves the digest in gSHA1Var.VarH0..VarH4 as native 32-bit
	values. Each word is then passed through ByteSwapDWord() (HelperAsm.h,
	assumed to reverse the four bytes of the word in place) before it is
	stored, so that on a little-endian host the bytes at pHashOut appear in the
	conventional big-endian digest order.

	Side effect: gSHA1Var.VarH0..VarH4 are left holding the byte-swapped words.
*/
void Sha1(UINT8 *pMsg, UINT32 MsgLen, UINT32 *pHashOut)
{
	if(NULL == pHashOut){
		return; //nowhere to store the digest
	}

	HashTarget(pMsg, MsgLen);

	/*
	The hash words are addressed by name rather than by walking a UINT32
	pointer over gSHA1Var, so this code does not depend on where VarH0..VarH4
	sit inside SHA1_VAR.
	*/
	ByteSwapDWord(&gSHA1Var.VarH0);
	ByteSwapDWord(&gSHA1Var.VarH1);
	ByteSwapDWord(&gSHA1Var.VarH2);
	ByteSwapDWord(&gSHA1Var.VarH3);
	ByteSwapDWord(&gSHA1Var.VarH4);

	pHashOut[0] = gSHA1Var.VarH0;
	pHashOut[1] = gSHA1Var.VarH1;
	pHashOut[2] = gSHA1Var.VarH2;
	pHashOut[3] = gSHA1Var.VarH3;
	pHashOut[4] = gSHA1Var.VarH4;
}

/*
	Run the complete SHA-1 computation over a message, leaving the result in
	gSHA1Var.VarH0..VarH4.

	pMsg   - message bytes to hash
	MsgLen - message length in bytes

	Steps:
	  1. Seed VarH0..VarH4 with CONST_H0..CONST_H4.
	  2. Feed every complete 64-byte block of the message to Sha1Block().
	  3. Build the padded tail: remaining bytes, a single 0x80 byte, zero
	     fill, and the message length in bits as a 64-bit big-endian value in
	     the last 8 bytes. The tail needs a second block when fewer than
	     9 bytes are free after the remaining message bytes.

	The length words are converted with ByteSwapDWord() (HelperAsm.h, assumed
	to reverse the four bytes of a word in place), i.e. the padding is laid out
	for a little-endian host.
*/
void HashTarget(UINT8 *pMsg, UINT32 MsgLen)
{
	UINT32 u32BlkCount = MsgLen / 64;	//complete 64-byte blocks
	UINT32 u32TailLen = MsgLen % 64;	//bytes left over after those blocks
	UINT32 u32Indx = 0;
	UINT32 u32BitLenHi = 0;
	UINT32 u32BitLenLo = 0;
	/*These two buffers hold the padded tail (one or two 64-byte blocks)*/
	UINT32 PadBuf0[16];
	UINT32 PadBuf1[16];
	UINT32 *pu32LenBlk = NULL;
	UINT8 *pMsgBase = pMsg;
	UINT8 *pu8Pad = NULL;

	/*Initialize the hash with initial value*/
	gSHA1Var.VarH0 = CONST_H0;
	gSHA1Var.VarH1 = CONST_H1;
	gSHA1Var.VarH2 = CONST_H2;
	gSHA1Var.VarH3 = CONST_H3;
	gSHA1Var.VarH4 = CONST_H4;

	for(u32Indx = 0; u32Indx < u32BlkCount; u32Indx++){
		Sha1Block(pMsgBase);
		pMsgBase += 64;
	}
	//Note: pMsgBase now points at the remaining bytes (incomplete block)

	/*
	Handle tail.
	Zap both buffers on stack - some execution overhead but simplifies logic
	*/
	ZeroMem(PadBuf0,16*4);
	ZeroMem(PadBuf1,16*4);

	/*
	CopyMem is called explicitly: plain assignments here were at times turned
	into an intrinsic memcpy by the compiler.
	*/
	if(0 != u32TailLen){
		CopyMem(PadBuf0, pMsgBase, u32TailLen);
	}

	/*Append the '1' bit. u32TailLen is at most 63, so the byte always fits in PadBuf0*/
	pu8Pad = (UINT8 *)PadBuf0;
	pu8Pad[u32TailLen] = 0x80;

	/*
	Message length in bits as a 64-bit value split into two 32-bit words.
	MsgLen * 8 does not fit in 32 bits once MsgLen reaches 0x20000000, so the
	three bits shifted out of the low word are carried into the high word.
	*/
	u32BitLenHi = MsgLen >> 29;
	u32BitLenLo = MsgLen << 3;
	ByteSwapDWord(&u32BitLenHi);//To Big endian
	ByteSwapDWord(&u32BitLenLo);//To Big endian

	/*The 0x80 byte plus the 8 length bytes need 9 free bytes in the block*/
	if(u32TailLen <= (64 - 9)){
		pu32LenBlk = PadBuf0;
	}
	else{
		pu32LenBlk = PadBuf1;
	}
	pu32LenBlk[14] = u32BitLenHi;
	pu32LenBlk[15] = u32BitLenLo;

	Sha1Block((UINT8 *)PadBuf0);

	if(u32TailLen > (64 - 9)){
		Sha1Block((UINT8 *)PadBuf1);
	}
}

/*
	Fold one 64-byte block into the running hash in gSHA1Var.VarH0..VarH4.

	pMsg - pointer to exactly 64 bytes of (already padded, if needed) message
	       data; no alignment is required

	The block is expanded into the 80-word schedule VarW16_79[], then 80 rounds
	are run on working variables a..e that start from VarH0..VarH4. The final
	working values are added back into VarH0..VarH4 and are also stored in
	gSHA1Var.VarA..VarE.

	RotateLeft() comes from HelperAsm.h; it is used here as "rotate the 32-bit
	word at the given address left by the given number of bits".
*/
void Sha1Block(UINT8 *pMsg)
{
	UINT32 u32Indx = 0;
	UINT32 u32Temp = 0;
	UINT32 u32Rot = 0;
	UINT32 u32A = 0;
	UINT32 u32B = 0;
	UINT32 u32C = 0;
	UINT32 u32D = 0;
	UINT32 u32E = 0;
	UINT8 *pu8Word = NULL;

	/*
	Break chunk into sixteen 32-bit big-endian words w[i], 0 <= i <= 15.
	The words are assembled byte by byte so the result does not depend on the
	alignment of pMsg or on the byte order of the host.
	*/
	for(u32Indx = 0; u32Indx < 16; u32Indx++){
		pu8Word = pMsg + (u32Indx * 4);
		VarW16_79[u32Indx] = ((UINT32)pu8Word[0] << 24) |
		                     ((UINT32)pu8Word[1] << 16) |
		                     ((UINT32)pu8Word[2] << 8) |
		                     (UINT32)pu8Word[3];
	}

	/*
	Extend the sixteen words into eighty:
		for i from 16 to 79
		w[i] = (w[i-3] xor w[i-8] xor w[i-14] xor w[i-16]) leftrotate 1
	*/
	for(u32Indx = 16; u32Indx < 80; u32Indx++){
		u32Temp = VarW16_79[u32Indx - 3] ^ VarW16_79[u32Indx - 8] ^
		          VarW16_79[u32Indx - 14] ^ VarW16_79[u32Indx - 16];
		RotateLeft(&u32Temp,1);
		VarW16_79[u32Indx] = u32Temp;
	}

	//Initialize working variables for this chunk from the running hash
	u32A = gSHA1Var.VarH0;
	u32B = gSHA1Var.VarH1;
	u32C = gSHA1Var.VarH2;
	u32D = gSHA1Var.VarH3;
	u32E = gSHA1Var.VarH4;

	for(u32Indx = 0; u32Indx < 80; u32Indx++){
		//u32Temp = f + k, where f and k depend on the round range
		if(u32Indx <= 19){
			u32Temp = F0_19(u32B, u32C, u32D);//f = (b and c) or ((not b) and d)
			u32Temp = u32Temp + CONST_K0_19; //k = 0x5A827999
		}
		else if(u32Indx <= 39){
			u32Temp = F20_39(u32B, u32C, u32D);//f = b xor c xor d
			u32Temp = u32Temp + CONST_K20_39; //k = 0x6ED9EBA1
		}
		else if(u32Indx <= 59){
			u32Temp = F40_59(u32B, u32C, u32D);//f = (b and c) or (b and d) or (c and d)
			u32Temp = u32Temp + CONST_K40_59;//k = 0x8F1BBCDC
		}
		else{
			u32Temp = F60_79(u32B, u32C, u32D);//f = b xor c xor d
			u32Temp = u32Temp + CONST_K60_79;//k = 0xCA62C1D6
		}

		//u32Temp = f + k + e + w[i] + (a leftrotate 5)
		u32Rot = u32A;
		RotateLeft(&u32Rot,5);
		u32Temp = u32Temp + u32E + VarW16_79[u32Indx] + u32Rot;

		//Shift the working variables: e = d, d = c, c = b leftrotate 30, b = a, a = u32Temp
		u32E = u32D;
		u32D = u32C;
		u32Rot = u32B;
		RotateLeft(&u32Rot, 30);
		u32C = u32Rot;
		u32B = u32A;
		u32A = u32Temp;
	}

	//Store the final working values in the global state as well
	gSHA1Var.VarA = u32A;
	gSHA1Var.VarB = u32B;
	gSHA1Var.VarC = u32C;
	gSHA1Var.VarD = u32D;
	gSHA1Var.VarE = u32E;

	/*Add this chunk's result to the running hash*/
	gSHA1Var.VarH0 += u32A;
	gSHA1Var.VarH1 += u32B;
	gSHA1Var.VarH2 += u32C;
	gSHA1Var.VarH3 += u32D;
	gSHA1Var.VarH4 += u32E;

}
